** This do file recreates the results summarized in Figure 1 of the paper

* The code uses aggregate census data calculated from the IPUMS International 
* Census Microdata Repository at https://international.ipums.org/international/

* Set the working directory to the folder holding the replication data.
* Forward slashes are used because Stata accepts them on Windows, macOS and
* Unix, whereas a backslash-separated path only resolves on Windows.
cd "~/Replication/"

*** Aggregating responses from online surveys

** chile
* Each sub-block reloads the raw web survey, computes the percentage of
* respondents falling in every category of one demographic variable, and
* stores the rows (country, variable, category, year, blackbox = percent)
* in fig1_blackbox_six.dta.

* age
use "chile2021web.dta", clear
* Bin age into six bands. Stata orders missing values above every number,
* so "age > 65" alone would also be true for a missing age; the top band
* therefore excludes missing explicitly. Respondents with missing age keep
* edad == . and end up in their own blank-category row.
gen edad = 1 if age <= 25
replace edad = 2 if age > 25 & age <= 35
replace edad = 3 if age > 35 & age <= 45
replace edad = 4 if age > 45 & age <= 55
replace edad = 5 if age > 55 & age <= 65
replace edad = 6 if age > 65 & !missing(age)
lab define edad_lbl 1 "25 and below" 2 "26-35" 3 "36-45" 4 "46-55" 5 "56-65" 6 "66 and up"
lab value edad edad_lbl

* Denominator: number of observations with non-missing complete
egen svy_total = count(complete)
* One row per age band; svy_total is constant, so (first) simply carries it along
collapse (count) svy_cases=complete (first) svy_total, by(edad)
rename edad age
gen svy_pct = svy_cases / svy_total * 100

* Turn the value label into the string category used as merge key later
decode age, gen(category)
drop age
rename svy_pct blackbox
gen country = "chile"
gen variable = "age"
gen year = "2021"

* First block of the script: this creates the accumulating file
save "fig1_blackbox_six.dta", replace

* sex
use "chile2021web.dta", clear
egen svy_total = count(complete)
collapse (count) svy_cases=complete (first) svy_total, by(gender)
* Codes other than 1 and 2 leave the category blank
gen sex = "Hombre" if gender == 1
replace sex = "Mujer" if gender == 2
gen svy_pct = svy_cases / svy_total * 100

rename sex category
drop gender
rename svy_pct blackbox
gen country = "chile"
gen variable = "sex"
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* education
use "chile2021web.dta", clear
egen svy_total = count(complete)
*1 - "none or primary"
*2 - "secondary"
*3 - "post-secondary"
recode q10 (1 2 3 = 1)(4 5 = 2)(6 7 8 9 = 3), gen(education_level)
collapse (count) svy_cases=complete (first) svy_total, by(education_level)
lab define ed_level 1 "None or primary" 2 "Secondary" 3 "Post-secondary"
lab values education_level ed_level
gen svy_pct = svy_cases / svy_total * 100

decode education_level, gen(category)
drop education_level
rename svy_pct blackbox
gen country = "chile"
gen variable = "educ"
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* prov
use "chile2021web.dta", clear
egen svy_total = count(complete)
collapse (count) svy_cases=complete (first) svy_total, by(cl_region)
gen svy_pct = svy_cases / svy_total * 100

* cl_region is used directly as the category (no decode), so it is
* presumably stored as a string -- confirm against the data file
rename cl_region category
rename svy_pct blackbox
gen country = "chile"
gen variable = "prov"
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

** colombia
* Percent of web-survey respondents per category of age, sex, education and
* department; each sub-block appends its rows to fig1_blackbox_six.dta.

* age
use "colombia2021web.dta", clear
* Stata orders missing values above every number, so the open-ended top
* band must exclude missing ages explicitly or they would be counted as
* "66 and up". Missing ages keep edad == . (blank category row).
gen edad = 1 if age <= 25
replace edad = 2 if age > 25 & age <= 35
replace edad = 3 if age > 35 & age <= 45
replace edad = 4 if age > 45 & age <= 55
replace edad = 5 if age > 55 & age <= 65
replace edad = 6 if age > 65 & !missing(age)
lab define edad_lbl 1 "25 and below" 2 "26-35" 3 "36-45" 4 "46-55" 5 "56-65" 6 "66 and up"
lab value edad edad_lbl

* Denominator: number of observations with non-missing complete
egen svy_total = count(complete)
collapse (count) svy_cases=complete (first) svy_total, by(edad)
rename edad age
gen svy_pct = svy_cases / svy_total * 100

decode age, gen(category)
drop age
rename svy_pct blackbox
gen country = "colombia"
gen variable = "age"
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* sex
use "colombia2021web.dta", clear
egen svy_total = count(complete)
collapse (count) svy_cases=complete (first) svy_total, by(gender)
gen sex = "Hombre" if gender == 1
replace sex = "Mujer" if gender == 2
gen svy_pct = svy_cases / svy_total * 100

rename sex category
drop gender
rename svy_pct blackbox
gen country = "colombia"
gen variable = "sex"
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* education
use "colombia2021web.dta", clear
egen svy_total = count(complete)
*1 - "none or primary"
*2 - "secondary"
*3 - "post-secondary"
recode q10 (1 2 3 = 1)(4 5 = 2)(6 7 8 9 = 3), gen(education_level)
collapse (count) svy_cases=complete (first) svy_total, by(education_level)
lab define ed_level 1 "None or primary" 2 "Secondary" 3 "Post-secondary"
lab values education_level ed_level
gen svy_pct = svy_cases / svy_total * 100

decode education_level, gen(category)
drop education_level
rename svy_pct blackbox
gen country = "colombia"
gen variable = "educ"
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* prov
use "colombia2021web.dta", clear
* Respondents without a department are removed before the denominator is
* computed, so these percentages are shares of respondents with a department
drop if co_departamento == .
egen svy_total = count(complete)
collapse (count) svy_cases=complete (first) svy_total, by(co_departamento)
gen svy_pct = svy_cases / svy_total * 100

decode co_departamento, gen(category)
drop co_departamento
rename svy_pct blackbox
gen country = "colombia"
gen variable = "prov"
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

** peru
* Percent of web-survey respondents per category of age, sex, education and
* department; each sub-block appends its rows to fig1_blackbox_six.dta.

* age
use "peru2021web.dta", clear
* Stata orders missing values above every number, so the open-ended top
* band must exclude missing ages explicitly or they would be counted as
* "66 and up". Missing ages keep edad == . (blank category row).
gen edad = 1 if age <= 25
replace edad = 2 if age > 25 & age <= 35
replace edad = 3 if age > 35 & age <= 45
replace edad = 4 if age > 45 & age <= 55
replace edad = 5 if age > 55 & age <= 65
replace edad = 6 if age > 65 & !missing(age)
lab define edad_lbl 1 "25 and below" 2 "26-35" 3 "36-45" 4 "46-55" 5 "56-65" 6 "66 and up"
lab value edad edad_lbl

* Denominator: number of observations with non-missing complete
egen svy_total = count(complete)
collapse (count) svy_cases=complete (first) svy_total, by(edad)
rename edad age
gen svy_pct = svy_cases / svy_total * 100

decode age, gen(category)
drop age
rename svy_pct blackbox
gen country = "peru"
gen variable = "age"
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* sex
use "peru2021web.dta", clear
egen svy_total = count(complete)
collapse (count) svy_cases=complete (first) svy_total, by(gender)
gen sex = "Hombre" if gender == 1
replace sex = "Mujer" if gender == 2
gen svy_pct = svy_cases / svy_total * 100

rename sex category
drop gender
rename svy_pct blackbox
gen country = "peru"
gen variable = "sex"
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* education
use "peru2021web.dta", clear
egen svy_total = count(complete)
* Four levels here ("none" kept separate from "primary")
*0 - "none"
*1 - "primary"
*2 - "secondary"
*3 - "post-secondary"
recode q10 (1 = 0)(2 3 = 1)(4 5 = 2)(6 7 8 9 = 3), gen(education_level)
collapse (count) svy_cases=complete (first) svy_total, by(education_level)
lab define ed_level 0 "None" 1 "Primary" 2 "Secondary" 3 "Post-secondary"
lab values education_level ed_level
gen svy_pct = svy_cases / svy_total * 100

decode education_level, gen(category)
drop education_level
rename svy_pct blackbox
gen country = "peru"
gen variable = "educ"
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* prov
use "peru2021web.dta", clear
egen svy_total = count(complete)
collapse (count) svy_cases=complete (first) svy_total, by(pe_departamento)
gen svy_pct = svy_cases / svy_total * 100

* pe_departamento is used directly as the category (no decode), so it is
* presumably stored as a string -- confirm against the data file
rename pe_departamento category
rename svy_pct blackbox
gen country = "peru"
gen variable = "prov"
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

** mexico
* Percent of web-survey respondents per category of age, sex, education and
* state; each sub-block appends its rows to fig1_blackbox_six.dta.

* age
use "mexico2021web.dta", clear
* Stata orders missing values above every number, so the open-ended top
* band must exclude missing ages explicitly or they would be counted as
* "66 and up". Missing ages keep edad == . (blank category row).
gen edad = 1 if age <= 25
replace edad = 2 if age > 25 & age <= 35
replace edad = 3 if age > 35 & age <= 45
replace edad = 4 if age > 45 & age <= 55
replace edad = 5 if age > 55 & age <= 65
replace edad = 6 if age > 65 & !missing(age)
lab define edad_lbl 1 "25 and below" 2 "26-35" 3 "36-45" 4 "46-55" 5 "56-65" 6 "66 and up"
lab value edad edad_lbl

* Denominator: number of observations with non-missing complete
egen svy_total = count(complete)
collapse (count) svy_cases=complete (first) svy_total, by(edad)
rename edad age
gen svy_pct = svy_cases / svy_total * 100

decode age, gen(category)
drop age
rename svy_pct blackbox
gen country = "mexico"
gen variable = "age"
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* sex
use "mexico2021web.dta", clear
egen svy_total = count(complete)
collapse (count) svy_cases=complete (first) svy_total, by(gender)
gen sex = "Hombre" if gender == 1
replace sex = "Mujer" if gender == 2
gen svy_pct = svy_cases / svy_total * 100

rename sex category
drop gender
rename svy_pct blackbox
gen country = "mexico"
gen variable = "sex"
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* education
use "mexico2021web.dta", clear
egen svy_total = count(complete)
* Same four levels as the other four-level countries, but coded 1-4 here
*1 - "none"
*2 - "primary"
*3 - "secondary"
*4 - "post-secondary"
recode q10 (1 = 1)(2 3 = 2)(4 5 = 3)(6 7 8 9 = 4), gen(education_level)
collapse (count) svy_cases=complete (first) svy_total, by(education_level)
lab define ed_level 1 "None" 2 "Primary" 3 "Secondary" 4 "Post-secondary"
lab values education_level ed_level
gen svy_pct = svy_cases / svy_total * 100

decode education_level, gen(category)
drop education_level
rename svy_pct blackbox
gen country = "mexico"
gen variable = "educ"
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* prov
use "mexico2021web.dta", clear
egen svy_total = count(complete)
collapse (count) svy_cases=complete (first) svy_total, by(mx_estado)
gen svy_pct = svy_cases / svy_total * 100

decode mx_estado, gen(category)
* proper() mis-capitalises around accented letters; the replacements below
* repair the resulting state names one by one
replace category = proper(category)
replace category = "Ciudad de México" if category == "Ciudad De MÉXico"
replace category = "Michoacán" if category == "MichoacÁN"
replace category = "México" if category == "MÉXico"
replace category = "Nuevo León" if category == "Nuevo LeÓN"
replace category = "Querétaro" if category == "QuerÉTaro"
replace category = "San Luis Potosí" if category == "San Luis PotosÍ"
replace category = "Yucatán" if category == "YucatÁN"
drop mx_estado
rename svy_pct blackbox
gen country = "mexico"
gen variable = "prov"
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

** argentina
* Percent of web-survey respondents per category of age, sex, education and
* province; each sub-block appends its rows to fig1_blackbox_six.dta.

* age
use "argentina2021web.dta", clear
* Stata orders missing values above every number, so the open-ended top
* band must exclude missing ages explicitly or they would be counted as
* "66 and up". Missing ages keep edad == . (blank category row).
gen edad = 1 if age <= 25
replace edad = 2 if age > 25 & age <= 35
replace edad = 3 if age > 35 & age <= 45
replace edad = 4 if age > 45 & age <= 55
replace edad = 5 if age > 55 & age <= 65
replace edad = 6 if age > 65 & !missing(age)
lab define edad_lbl 1 "25 and below" 2 "26-35" 3 "36-45" 4 "46-55" 5 "56-65" 6 "66 and up"
lab value edad edad_lbl

* Denominator: number of observations with non-missing complete
egen svy_total = count(complete)
collapse (count) svy_cases=complete (first) svy_total, by(edad)
rename edad age
gen svy_pct = svy_cases / svy_total * 100

decode age, gen(category)
drop age
rename svy_pct blackbox
gen country = "argentina"
gen variable = "age"
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* sex
use "argentina2021web.dta", clear
egen svy_total = count(complete)
collapse (count) svy_cases=complete (first) svy_total, by(gender)
gen sex = "Hombre" if gender == 1
replace sex = "Mujer" if gender == 2
gen svy_pct = svy_cases / svy_total * 100

rename sex category
drop gender
rename svy_pct blackbox
gen country = "argentina"
gen variable = "sex"
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* education
use "argentina2021web.dta", clear
egen svy_total = count(complete)
*0 - "none"
*1 - "Primary"
*2 - "secondary"
*3 - "post-secondary"
recode q10 (1 = 0)(2 3 = 1)(4 5 = 2)(6 7 8 9 = 3), gen(education_level)
collapse (count) svy_cases=complete (first) svy_total, by(education_level)
lab define ed_level 0 "None" 1 "Primary" 2 "Secondary" 3 "Post-secondary"
lab values education_level ed_level
gen svy_pct = svy_cases / svy_total * 100

decode education_level, gen(category)
drop education_level
rename svy_pct blackbox
gen country = "argentina"
gen variable = "educ"
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* prov
use "argentina2021web.dta", clear
* Respondents without a province are removed before the denominator is
* computed, so these percentages are shares of respondents with a province
drop if ar_provincia == .
egen svy_total = count(complete)
collapse (count) svy_cases=complete (first) svy_total, by(ar_provincia)
gen svy_pct = svy_cases / svy_total * 100

decode ar_provincia, gen(category)
drop ar_provincia
rename svy_pct blackbox
gen country = "argentina"
gen variable = "prov"
* Strip leading/trailing blanks from the decoded province names
replace category = strtrim(category)
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

** brazil
* Percent of web-survey respondents per category of age, sex, education and
* state; each sub-block appends its rows to fig1_blackbox_six.dta.
* The "svy: tab" lines only display a tabulation; svy needs the data to be
* svyset, which is presumably stored inside brazil2021web.dta -- confirm.

* age
use "brazil2021web.dta", clear
* Stata orders missing values above every number, so the open-ended top
* band must exclude missing ages explicitly or they would be counted as
* "66 and up". Missing ages keep edad == . (blank category row).
gen edad = 1 if age <= 25
replace edad = 2 if age > 25 & age <= 35
replace edad = 3 if age > 35 & age <= 45
replace edad = 4 if age > 45 & age <= 55
replace edad = 5 if age > 55 & age <= 65
replace edad = 6 if age > 65 & !missing(age)
lab define edad_lbl 1 "25 and below" 2 "26-35" 3 "36-45" 4 "46-55" 5 "56-65" 6 "66 and up"
lab value edad edad_lbl

svy: tab edad, per
* Denominator: number of observations with non-missing complete
egen svy_total = count(complete)
collapse (count) svy_cases=complete (first) svy_total, by(edad)
rename edad age
gen svy_pct = svy_cases / svy_total * 100

decode age, gen(category)
drop age
rename svy_pct blackbox
gen country = "brazil"
gen variable = "age"
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* sex
use "brazil2021web.dta", clear
svy: tab gender, per
egen svy_total = count(complete)
collapse (count) svy_cases=complete (first) svy_total, by(gender)
gen sex = "Hombre" if gender == 1
replace sex = "Mujer" if gender == 2
gen svy_pct = svy_cases / svy_total * 100

rename sex category
drop gender
rename svy_pct blackbox
gen country = "brazil"
gen variable = "sex"
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* education
use "brazil2021web.dta", clear
egen svy_total = count(complete)
*0 - "none"
*1 - "primary"
*2 - "secondary"
*3 - "post-secondary"
recode q10 (1 = 0)(2 3 = 1)(4 5 = 2)(6 7 8 9 = 3), gen(education_level)
collapse (count) svy_cases=complete (first) svy_total, by(education_level)
lab define ed_level 0 "None" 1 "Primary" 2 "Secondary" 3 "Post-secondary"
lab values education_level ed_level
gen svy_pct = svy_cases / svy_total * 100

decode education_level, gen(category)
drop education_level
rename svy_pct blackbox
gen country = "brazil"
gen variable = "educ"
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* prov
use "brazil2021web.dta", clear
* br_estado is a string; blank states are removed before the denominator is
* computed, so these percentages are shares of respondents with a state
drop if br_estado == ""
svy: tab br_estado, per
egen svy_total = count(complete)
collapse (count) svy_cases=complete (first) svy_total, by(br_estado)
gen svy_pct = svy_cases / svy_total * 100

rename br_estado category
rename svy_pct blackbox
gen country = "brazil"
gen variable = "prov"
gen year = "2021"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace


** brazil 2020
* Percent of 2020 survey respondents per category of sex, education, state
* and age; the denominator is the number of non-missing values of finished.
* Each sub-block appends its rows to fig1_blackbox_six.dta with year "2020".

* sex
use br_blackbox.dta, clear
* Replace the file's existing gender variable with p_sexo
drop gender
rename p_sexo gender
egen svy_total = count(finished)
collapse (count) svy_cases=finished (first) svy_total, by(gender)
gen sex = "Hombre" if gender == 1
replace sex = "Mujer" if gender == 2
gen svy_pct = svy_cases / svy_total * 100

rename sex category
drop gender
rename svy_pct blackbox
gen country = "brazil"
gen variable = "sex"
gen year = "2020"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* edu
use br_blackbox.dta, clear
* Rebuild edr from br_education_level, discarding the version in the file
drop edr
recode br_education_level (1 2 3=1)(4=2)(5 6 7 8=3), gen(edr)
lab define ed_level 0 "None" 1 "Primary" 2 "Secondary" 3 "Post-secondary" 4 "DK/NA"
lab values edr ed_level

egen svy_total = count(finished)
collapse (count) svy_cases=finished (first) svy_total, by(edr)
lab values edr ed_level
gen svy_pct = svy_cases / svy_total * 100

decode edr, gen(category)
drop edr
rename svy_pct blackbox
gen country = "brazil"
gen variable = "educ"
gen year = "2020"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* prov
use br_blackbox.dta, clear
egen svy_total = count(finished)
collapse (count) svy_cases=finished (first) svy_total, by(int_estado)
gen svy_pct = svy_cases / svy_total * 100

decode int_estado,  gen(category)
drop int_estado
rename svy_pct blackbox
gen country = "brazil"
gen variable = "prov"
gen year = "2020"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* age
use br_blackbox.dta, clear
* Rebuild edad from panelistage, discarding the version in the file
drop edad
* Stata orders missing values above every number, so the open-ended top
* band must exclude missing ages explicitly or they would be counted in
* band 6. Missing ages keep edad == . (blank category row).
gen edad = 1 if panelistage <= 25
replace edad = 2 if panelistage > 25 & panelistage <= 35
replace edad = 3 if panelistage > 35 & panelistage <= 45
replace edad = 4 if panelistage > 45 & panelistage <= 55
replace edad = 5 if panelistage > 55 & panelistage <= 65
replace edad = 6 if panelistage > 65 & !missing(panelistage)
* NOTE(review): edad_lbl is not defined in this block; it is presumably
* stored inside br_blackbox.dta -- confirm, since decode below depends on it
lab value edad edad_lbl

egen svy_total = count(finished)
collapse (count) svy_cases=finished (first) svy_total, by(edad)
rename edad age
gen svy_pct = svy_cases / svy_total * 100

decode age, gen(category)
drop age
rename svy_pct blackbox
gen country = "brazil"
gen variable = "age"
gen year = "2020"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* Brazil 2020 benchmark items. For each item the share of respondents per
* answer category is computed with collapse (percent) on a constant, after
* dropping respondents with a missing answer; rows are tagged with the
* benchmark they are compared to ("pnadc2017") and accumulated in
* fig1_benchmark_three.dta.

* household size
use "br_blackbox.dta", clear
* Map raw q144 codes onto 1-10; codes not listed are left unchanged by recode
recode q144 (4=1) (5=2) (7=3) (8=4) (9=5) (10=6) (11=7) (12=8) (13=9) (14=10), generate(house_size)
svy: tabulate house_size
generate blackbox = 1
drop if house_size == .
collapse (percent) blackbox, by(house_size)
generate country = "brazil"
generate comparison_to = "pnadc2017"
* Category must be a string so it can be stacked with the other items
rename house_size category
tostring category, replace
generate variable = "house_size"
* First benchmark block of the script: this creates the accumulating file
save "fig1_benchmark_three.dta", replace

* number of rooms
use "br_blackbox.dta", clear
rename q147 num_rooms
svy: tabulate num_rooms
generate blackbox = 1
drop if num_rooms == .
collapse (percent) blackbox, by(num_rooms)
generate country = "brazil"
generate comparison_to = "pnadc2017"
rename num_rooms category
tostring category, replace
generate variable = "num_rooms"
append using "fig1_benchmark_three.dta"
save "fig1_benchmark_three.dta", replace

* home ownership
use "br_blackbox.dta", clear
generate blackbox = 1
drop if q145 == .
label define q145 ///
    1 "Próprio de algum morador - já pago" ///
    2 "Próprio de algum morador - ainda pagando" ///
    3 "Alugado" ///
    4 "Cedido por empregador" ///
    5 "Cedido por familiar" ///
    6 "Cedido de outra forma" ///
    7 "Outra condição"
label values q145 q145
svy: tabulate q145
collapse (percent) blackbox, by(q145)
generate country = "brazil"
generate comparison_to = "pnadc2017"
* The label text becomes the category string
decode q145, generate(category)
drop q145
generate variable = "homeowner"
append using "fig1_benchmark_three.dta"
save "fig1_benchmark_three.dta", replace

* employment
use "br_blackbox.dta", clear
* 1 if q146 is 1, 0 if q146 is 2 or 3, missing for anything else
generate employed = cond(q146 == 1, 1, cond(inlist(q146, 2, 3), 0, .))
label define employed 1 "Sim" 0 "Não"
label values employed employed
generate blackbox = 1
drop if employed == .
svy: tabulate employed
collapse (percent) blackbox, by(employed)
generate country = "brazil"
generate comparison_to = "pnadc2017"
decode employed, generate(category)
drop employed
generate variable = "employment"
append using "fig1_benchmark_three.dta"
save "fig1_benchmark_three.dta", replace

** argentina 2020
* Percent of 2020 survey respondents per category of sex, education, age
* and province; the denominator is the number of non-missing values of
* finished. Each sub-block appends its rows to fig1_blackbox_six.dta with
* year "2020".

* sex
use ar_blackbox.dta, clear
* Replace the file's existing gender variable with p_sexo
drop gender
rename p_sexo gender
egen svy_total = count(finished)
collapse (count) svy_cases=finished (first) svy_total, by(gender)
gen sex = "Hombre" if gender == 1
replace sex = "Mujer" if gender == 2
gen svy_pct = svy_cases / svy_total * 100

rename sex category
drop gender
rename svy_pct blackbox
gen country = "argentina"
gen variable = "sex"
gen year = "2020"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* education
use ar_blackbox.dta, clear
* Rebuild edr from ar_education_level, discarding the version in the file
drop edr
recode ar_education_level (1=0)(2 3=1)(4 5=2)(6 7 8 9 10=3)(11=4), gen(edr)
lab define ed_level 0 "None" 1 "Primary" 2 "Secondary" 3 "Post-secondary" 4 "DK/NA"
lab values edr ed_level

egen svy_total = count(finished)
collapse (count) svy_cases=finished (first) svy_total, by(edr)
lab values edr ed_level
gen svy_pct = svy_cases / svy_total * 100

decode edr, gen(category)
drop edr
rename svy_pct blackbox
gen country = "argentina"
gen variable = "educ"
gen year = "2020"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* age 
use ar_blackbox.dta, clear
* Rebuild edad from panelistage, discarding the version in the file
drop edad
* Stata orders missing values above every number, so the open-ended top
* band must exclude missing ages explicitly or they would be counted in
* band 6. Missing ages keep edad == . (blank category row).
gen edad = 1 if panelistage <= 25
replace edad = 2 if panelistage > 25 & panelistage <= 35
replace edad = 3 if panelistage > 35 & panelistage <= 45
replace edad = 4 if panelistage > 45 & panelistage <= 55
replace edad = 5 if panelistage > 55 & panelistage <= 65
replace edad = 6 if panelistage > 65 & !missing(panelistage)
* NOTE(review): edad_lbl is not defined in this block; it is presumably
* stored inside ar_blackbox.dta -- confirm, since decode below depends on it
lab value edad edad_lbl

egen svy_total = count(finished)
collapse (count) svy_cases=finished (first) svy_total, by(edad)
rename edad age
gen svy_pct = svy_cases / svy_total * 100

decode age, gen(category)
drop age
rename svy_pct blackbox
gen country = "argentina"
gen variable = "age"
gen year = "2020"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* prov
use ar_blackbox.dta, clear
egen svy_total = count(finished)
collapse (count) svy_cases=finished (first) svy_total, by(ar_provincia)
gen svy_pct = svy_cases / svy_total * 100

decode ar_provincia,  gen(category)
drop ar_provincia
rename svy_pct blackbox
gen country = "argentina"
gen variable = "prov"
gen year = "2020"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* Argentina 2020 benchmark items. For each item the share of respondents per
* answer category is computed with collapse (percent) on a constant; rows
* are tagged with the benchmark they are compared to ("eph2018") and
* appended to fig1_benchmark_three.dta.

* household size
use "ar_blackbox.dta", clear
* Map raw q402 codes onto 1-10; codes not listed are left unchanged by recode
recode q402 (1=1) (4=2) (5=3) (6=4) (7=5) (8=6) (9=7) (10=8) (11=9) (12=10), generate(house_size)
svy: tabulate house_size
generate blackbox = 1
drop if house_size == .
collapse (percent) blackbox, by(house_size)
generate country = "argentina"
generate comparison_to = "eph2018"
* Category must be a string so it can be stacked with the other items
rename house_size category
tostring category, replace
generate variable = "house_size"
append using "fig1_benchmark_three.dta"
save "fig1_benchmark_three.dta", replace

* number of rooms
use "ar_blackbox.dta", clear
recode q408 (4=1) (7=2) (8=3) (9=4) (10=5) (11=6) (12=7) (13=8), generate(num_rooms)
svy: tabulate num_rooms
generate blackbox = 1
drop if num_rooms == .
collapse (percent) blackbox, by(num_rooms)
generate country = "argentina"
generate comparison_to = "eph2018"
rename num_rooms category
tostring category, replace
generate variable = "num_rooms"
append using "fig1_benchmark_three.dta"
save "fig1_benchmark_three.dta", replace

* employment (1 = Yes, 2 = No)
use "ar_blackbox.dta", clear
rename q398 trabajo_sp
label define trabajo_sp 1 "Si" 2 "No"
label values trabajo_sp trabajo_sp
svy: tabulate trabajo_sp
generate blackbox = 1
drop if trabajo_sp == .
collapse (percent) blackbox, by(trabajo_sp)
generate country = "argentina"
generate comparison_to = "eph2018"
* The label text becomes the category string
decode trabajo_sp, generate(category)
drop trabajo_sp
generate variable = "employment"
append using "fig1_benchmark_three.dta"
save "fig1_benchmark_three.dta", replace


* The next three items are all derived from q409, a string holding a
* comma-separated list of selected options ("1", "2", "3" and combinations).
* NOTE(review): each indicator is 0 whenever q409 is not in its list, which
* includes a blank q409, so the "drop if ... == ." lines never remove a row
* -- confirm that blank answers are meant to count as "no".

* pension (option 1 selected)
use "ar_blackbox.dta", clear
generate pension = inlist(q409, "1", "1,2", "1,2,3", "1,3")
label define pension 1 "jubilación o pension" 0 "no jubilación o pension"
label values pension pension
svy: tabulate pension
generate blackbox = 1
drop if pension == .
collapse (percent) blackbox, by(pension)
generate country = "argentina"
generate comparison_to = "eph2018"
decode pension, generate(category)
drop pension
generate variable = "govassist_retired"
append using "fig1_benchmark_three.dta"
save "fig1_benchmark_three.dta", replace

* subsidy (option 2 selected)
use "ar_blackbox.dta", clear
generate subsidy = inlist(q409, "1,2", "1,2,3", "2", "2,3")
label define subsidy 1 "subsidio o ayuda social " 0 "no subsidio o ayuda social"
label values subsidy subsidy
svy: tabulate subsidy
generate blackbox = 1
drop if subsidy == .
collapse (percent) blackbox, by(subsidy)
generate country = "argentina"
generate comparison_to = "eph2018"
decode subsidy, generate(category)
drop subsidy
generate variable = "govassist_subsidy"
append using "fig1_benchmark_three.dta"
save "fig1_benchmark_three.dta", replace

* goods from govt or church (option 3 selected)
use "ar_blackbox.dta", clear
generate social_goods = inlist(q409, "1,2,3", "1,3", "2,3", "3")
label define social_goods 1 "mercaderías, ropa, alimentos" 0 "no mercaderías, ropa, alimentos"
label values social_goods social_goods
svy: tabulate social_goods
generate blackbox = 1
drop if social_goods == .
collapse (percent) blackbox, by(social_goods)
generate country = "argentina"
generate comparison_to = "eph2018"
decode social_goods, generate(category)
drop social_goods
generate variable = "govassist_goods"
append using "fig1_benchmark_three.dta"
save "fig1_benchmark_three.dta", replace


** mexico 2020
* Percent of 2020 survey respondents per category of sex, education, age
* and state; the denominator is the number of non-missing values of
* finished. Each sub-block appends its rows to fig1_blackbox_six.dta with
* year "2020".

* sex
use mx_blackbox.dta, clear
* Replace the file's existing gender variable with p_sexo
drop gender
rename p_sexo gender
egen svy_total = count(finished)
collapse (count) svy_cases=finished (first) svy_total, by(gender)
gen sex = "Hombre" if gender == 1
replace sex = "Mujer" if gender == 2
gen svy_pct = svy_cases / svy_total * 100

rename sex category
drop gender
rename svy_pct blackbox
gen country = "mexico"
gen variable = "sex"
gen year = "2020"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* education
use mx_blackbox.dta, clear
* Rebuild edr from mx_education_level, discarding the version in the file
drop edr
recode mx_education_level (1=0)(2 3=1)(4 5 6 7 8 9=2)(10 11 12 13=3), gen(edr)
lab define ed_level 0 "None" 1 "Primary" 2 "Secondary" 3 "Post-secondary" 4 "DK/NA"
lab values edr ed_level

egen svy_total = count(finished)
collapse (count) svy_cases=finished (first) svy_total, by(edr)
lab values edr ed_level
gen svy_pct = svy_cases / svy_total * 100

decode edr, gen(category)
drop edr
rename svy_pct blackbox
gen country = "mexico"
gen variable = "educ"
gen year = "2020"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* age 
use mx_blackbox.dta, clear
* Rebuild edad from panelistage, discarding the version in the file
drop edad
* Stata orders missing values above every number, so the open-ended top
* band must exclude missing ages explicitly or they would be counted in
* band 6. Missing ages keep edad == . (blank category row).
gen edad = 1 if panelistage <= 25
replace edad = 2 if panelistage > 25 & panelistage <= 35
replace edad = 3 if panelistage > 35 & panelistage <= 45
replace edad = 4 if panelistage > 45 & panelistage <= 55
replace edad = 5 if panelistage > 55 & panelistage <= 65
replace edad = 6 if panelistage > 65 & !missing(panelistage)
* NOTE(review): edad_lbl is not defined in this block; it is presumably
* stored inside mx_blackbox.dta -- confirm, since decode below depends on it
lab value edad edad_lbl

egen svy_total = count(finished)
collapse (count) svy_cases=finished (first) svy_total, by(edad)
rename edad age
gen svy_pct = svy_cases / svy_total * 100

decode age, gen(category)
drop age
rename svy_pct blackbox
gen country = "mexico"
gen variable = "age"
gen year = "2020"
append using "fig1_blackbox_six.dta"
save "fig1_blackbox_six.dta", replace

* prov
use mx_blackbox.dta, clear
egen svy_total = count(finished)
collapse (count) svy_cases=finished (first) svy_total, by(mx_int_estado)
gen svy_pct = svy_cases / svy_total * 100

decode mx_int_estado,  gen(category)
* proper() mis-capitalises around accented letters; the replacements below
* repair the resulting state names one by one
replace category = proper(category)
replace category = "Ciudad de México" if category == "Ciudad De MÉXico"
replace category = "Michoacán" if category == "MichoacÁN"
replace category = "México" if category == "MÉXico"
replace category = "Nuevo León" if category == "Nuevo LeÓN"
replace category = "Querétaro" if category == "QuerÉTaro"
replace category = "San Luis Potosí" if category == "San Luis PotosÍ"
replace category = "Yucatán" if category == "YucatÁN"
drop mx_int_estado
rename svy_pct blackbox
gen country = "mexico"
gen variable = "prov"
gen year = "2020"
append using "fig1_blackbox_six.dta"
* No later block in this script appends to fig1_blackbox_six.dta, so the
* survey year is folded into country here for every accumulated row
* (e.g. "mexico 2020", "chile 2021"), and year is converted to numeric.
replace country = country + " " + year
destring year, replace
save "fig1_blackbox_six.dta", replace

* Mexico 2020 benchmark items. For each item the share of respondents per
* answer category is computed with collapse (percent) on a constant, after
* dropping respondents with a missing answer; rows are tagged with the
* benchmark they are compared to ("enh2017") and appended to
* fig1_benchmark_three.dta.

* household size
use "mx_blackbox.dta", clear
* Map raw q138 codes onto 1-10; codes not listed are left unchanged by recode
recode q138 (1=1) (4=2) (5=3) (6=4) (7=5) (8=6) (9=7) (10=8) (11=9) (12=10), generate(house_size)
svy: tabulate house_size
generate blackbox = 1
drop if house_size == .
collapse (percent) blackbox, by(house_size)
generate country = "mexico"
generate comparison_to = "enh2017"
* Category must be a string so it can be stacked with the other items
rename house_size category
tostring category, replace
generate variable = "house_size"
append using "fig1_benchmark_three.dta"
save "fig1_benchmark_three.dta", replace

* number of rooms
use "mx_blackbox.dta", clear
recode v273 (4=1) (7=2) (8=3) (9=4) (10=5) (11=6) (12=7) (13=8), generate(num_rooms)
svy: tabulate num_rooms
generate blackbox = 1
drop if num_rooms == .
collapse (percent) blackbox, by(num_rooms)
generate country = "mexico"
generate comparison_to = "enh2017"
rename num_rooms category
tostring category, replace
generate variable = "num_rooms"
append using "fig1_benchmark_three.dta"
save "fig1_benchmark_three.dta", replace

* home ownership
use "mx_blackbox.dta", clear
generate blackbox = 1
drop if q140 == .
label define q140 ///
    1 "es rentada" ///
    2 "es prestada" ///
    3 "es propia pero la están pagando" ///
    4 "es propia" ///
    5 "esta en litigio" ///
    6 "otra situación"
label values q140 q140
svy: tabulate q140
collapse (percent) blackbox, by(q140)
generate country = "mexico"
generate comparison_to = "enh2017"
* The label text becomes the category string
decode q140, generate(category)
drop q140
generate variable = "homeowner"
append using "fig1_benchmark_three.dta"
save "fig1_benchmark_three.dta", replace

* employment (1 = Yes, 2 = No)
use "mx_blackbox.dta", clear
rename q141 trabajo_sp
label define trabajo_sp 1 "Si" 2 "No"
label values trabajo_sp trabajo_sp
svy: tabulate trabajo_sp
generate blackbox = 1
drop if trabajo_sp == .
collapse (percent) blackbox, by(trabajo_sp)
generate country = "mexico"
generate comparison_to = "enh2017"
decode trabajo_sp, generate(category)
drop trabajo_sp
generate variable = "employment"
append using "fig1_benchmark_three.dta"
save "fig1_benchmark_three.dta", replace


*** Merge aggregate census data and calculate differences (MAEs)
import delimited "census_aggregates.csv", clear
* Keep every census row, matched or not; rows that exist only in the survey
* file (e.g. blank categories) are discarded by keep(master match)
merge 1:1 country variable category using "fig1_blackbox_six.dta", keep(master match)
* A census category with no survey respondents has a survey share of 0
replace blackbox = 0 if _merge == 1
drop _merge

* Duplicate every row and relabel the copies so "Pooled" averages over all
* country-years
expand 2, generate(duplicate)
replace country = "Pooled" if duplicate == 1

* Capitalise country names for the axis labels
replace country = strproper(country)

* Absolute difference between census and survey percentages
generate abs_error_netquest = abs(population - blackbox)

* Keep only the response category that is the modal response in the benchmark data
keep if modal == 1

* Distribution of the errors, excluding the duplicated pooled rows
summarize abs_error_netquest if country != "Pooled", detail

* Bar graph of demographics: graph bar plots the mean of abs_error_netquest
* within each country
graph bar abs_error_netquest, ///
    over(country, label(angle(45))) scheme(plotplain) scale(1.5) fxsize(140) ///
    ytitle("Mean absolute error") ///
    ylabel(0 "0" 2 "2" 4 "4" 6 "6" 8 "8" 10 "10" 12 "12" 14 "14" 16 "16") ///
    title("Census Questions") saving(demo_errors, replace)

*** Merge aggregate benchmark survey data and calculate differences (MAEs)
import delimited "benchmark_surveys_aggregates.csv", clear
* All rows from both files are kept here (no _merge filtering)
merge 1:1 country variable category using "fig1_benchmark_three.dta", nogenerate

* Duplicate every row and relabel the copies so "Pooled" averages over all
* countries
expand 2, generate(duplicate)
replace country = "Pooled" if duplicate == 1

* Capitalise country names for the axis labels
replace country = strproper(country)

* Absolute difference between benchmark and survey percentages
generate abs_error_netquest = abs(population - blackbox)

* Keep only the response category that is the modal response in the benchmark data
keep if modal == 1

* The benchmark items come from the 2020 surveys; add the year to the labels
replace country = "Argentina 2020" if country == "Argentina"
replace country = "Brazil 2020" if country == "Brazil"
replace country = "Mexico 2020" if country == "Mexico"

* Bar graph: mean of abs_error_netquest within each country, leaving out
* rows whose comparison source is "ipums"
graph bar abs_error_netquest if comparison_to != "ipums", ///
    over(country, label(angle(45))) scheme(plotplain) fxsize(80) ///
    ytitle("Mean absolute error") ///
    ylabel(0 "0" 2 "2" 4 "4" 6 "6" 8 "8" 10 "10" 12 "12" 14 "14" 16 "16") scale(1.5) ///
    title("Benchmark Questions") saving(bench_errors, replace)

* Combine the two saved panels side by side into figure 1
graph combine bench_errors.gph demo_errors.gph, rows(1) scheme(plotplain)